In [2]:
import ghgmosh as gm
import datavis as dv
reload(gm)
Out[2]:
In [4]:
scope1 = gm.getscope_allyears(1)
scope2 = gm.getscope_allyears(2)
In [11]:
# indexed by Organisation and year
scopes12 = scope1.join(scope2[['scope2']], how='outer')
scopes12.to_pickle("../CDPdata/2010to2014scopes1or2.pkl")
In [12]:
scopes12 = pd.read_pickle("../CDPdata/2010to2014scopes1or2.pkl")
In [13]:
s = scopes12
scopes12_complete = s[(s["scope2"].isnull()==False) & (s["scope1"].isnull()==False)].reset_index().set_index(["Organisation", "year"])
scopes12_incomplete = s[(s["scope2"].isnull()==True) | (s["scope1"].isnull()==True)].reset_index().set_index(["Organisation", "year"])
In [21]:
# so most companies that report scope 1 also report scope 2
len(scopes12) # 8023
len(scopes12.index.levels[0]) # 2727
s12_cos = scopes12.index.levels[0]
len(scopes12_complete.index.levels[0]) # 2646
s12_completecos = scopes12_complete.index.levels[0]
len(scopes12_complete) # 7787
incomplete_cos = set(s12_cos).difference(set(s12_completecos))
In [22]:
# some nans are b/c they say the emissions are not relevant (like for Alliant and Duke Energy)
# but some are b/c they say they're relevant but not calculated
# relevance is on a different sheet, can just use 2014 data
excuses = pd.read_pickle("../CDPdata/excuses_s12.pkl")
In [23]:
good_excuses = ["Emissions are not relevant", "No emissions from this source",
"No emissions excluded", "Emissions excluded due to a recent acquisition"]
excuses[excuses.columns[1]].value_counts().index
Out[23]:
In [24]:
excuses.drop_duplicates(cols="Organisation", inplace=True)
excuses["scope1 null"] = excuses[excuses.columns[1]].apply(lambda x: 0 if str(x) in good_excuses else float("nan"))
excuses["scope2 null"] = excuses[excuses.columns[2]].apply(lambda x: 0 if str(x) in good_excuses else float("nan"))
In [25]:
scopes12_incomplete = scopes12_incomplete.reset_index().set_index("Organisation")
excuses.set_index("Organisation", inplace=True)
In [26]:
incomplete_excuses = scopes12_incomplete.join(excuses[["scope1 null", "scope2 null"]])
complete_reasons = incomplete_excuses[(incomplete_excuses["scope1 null"].isnull()==False) & (incomplete_excuses["scope2 null"].isnull()==False)]
In [27]:
s12_addback = complete_reasons.drop(["scope1 null", "scope2 null"],1)
s12_addback["scope1"] = s12_addback["scope1"].fillna(0)
s12_addback["scope2"] = s12_addback["scope2"].fillna(0)
In [28]:
s12_addback = s12_addback.reset_index().set_index(["Organisation", "year"])
In [29]:
# more cleanup
s = pd.concat([s12_addback, scopes12_complete])
s12_completed = s[(s["Country"].isnull()==False) & (s["GICS Industry"].isnull()==False)]
In [30]:
s12_companies = s12_completed.index.levels[0]
len(s12_companies) #2648 yay
Out[30]:
In [2]:
s12_completed = pd.read_pickle("../CDPdata/s12_completed.pkl")
In [31]:
s12_completed.to_pickle("../CDPdata/s12_completed.pkl")
s12_completed.head()
Out[31]:
In [26]:
# get all 5 years for balanced panel
s = scopes12_complete.reset_index()
s12_count = s[s["year"].isin(range(2009,2014))].groupby("Organisation").count()
s12_companies = s12_count[s12_count["year"]==5].index.tolist()
In [16]:
scopes12_complete5 = scopes12_complete[scopes12_complete.index.isin(s12_companies,level=0)].reset_index().set_index(["Organisation", "year"])
In [18]:
len(s12_companies) # 643
len(scopes12_complete5) # 3308 has extra because some companies also have 2009 numbers
scopes12_complete5.to_pickle("../CDPdata/2010to2014scopes12fiveyrs.pkl")
scopes12_complete.to_pickle("../CDPdata/2010to2014scopes12.pkl")
In [32]:
# len(s12_companies)
orginfos = pd.read_pickle("../CDPdata/orginfos.pkl")
In [33]:
p = pd.read_pickle("../CDPdata/sheet35_2014.pkl")
orginfos = p[["Organisation"] + p.columns[2:7].tolist()+["Ticker", "ISIN"]]
orginfos = orginfos.drop_duplicates("Organisation").set_index("Organisation")
orginfos.to_pickle("../CDPdata/orginfos.pkl")
len(orginfos)
Out[33]:
In [34]:
s12_company_ids = orginfos[orginfos.index.isin(s12_companies)][["ISIN", "Ticker", "Country"]]
s12_company_ids["Ticker"] = s12_company_ids["Ticker"].apply(lambda(x): str(x).split(" ")[0])
len(s12_company_ids) # 1732
Out[34]:
In [35]:
americans = s12_company_ids[s12_company_ids["Country"]=="USA"][["Ticker"]]
americans=americans[americans["Ticker"]!="nan"]
americans.to_excel("../CDPdata/s12_american_ids.xlsx")
len(americans) # 381
Out[35]:
In [17]:
world = s12_company_ids[s12_company_ids["Country"]!="USA"][["ISIN"]]
world=world[world["ISIN"].isnull()==False]
world.to_excel("../CDPdata/s12_world_ids.xlsx")
len(world) # 1264
Out[17]:
In [286]:
p = pd.read_pickle("../CDPdata/sheet68_2014.pkl")
p.columns
Out[286]:
In [163]:
N = 20
categories = ['y' + str(x) for x in range(10)]
data = {}
data['x'] = np.arange(N)
for cat in categories:
data[cat] = np.random.randint(10, 100, size=N)
x2 = np.hstack((data['x'][::-1], data['x']))
In [164]:
df = pd.DataFrame(data)
df = df.set_index(['x'])
areas = dv.stacked_cols(df, categories)
In [167]:
p = figure()
p.patches([x2 for a in areas], list(areas.values()), color=colors, alpha=0.8, line_color=None)
Out[167]:
In [ ]: